***** Clean Repair Data


clear 
clear matrix
set memory 1000m
set more off
cap log close


cd "/Users/..."

global do_file="‎⁨/Users/.../do_file"
global log_file="/Users/.../log_file"
global raw_data="/Users/.../raw_data⁩⁩"
global working_data="/Users/.../working_data"
global results="/Users/.../results"



use raw_data/repair_raw.dta, clear

replace var5="MAZDA6后视摄像头（前装用品）" if _n==6732
replace var6="17198" if _n==6732
drop if _n==6733

list if var7~=.

replace var5="发动机支座总成;变速器前支座总成;变速器后支座总成;变速器支座;R前下控制臂总成(带衬套);胎压传感器;节流阀清洗剂;燃油系统清洗套餐;外加工螺丝攻牙" if _n==9564
replace var6="62457" if _n==9564
drop if _n==9565

tab var7, m
drop var7

rename var1 vin
rename var2 repair_date
rename var3 repair_type
rename var4 repair_content
rename var5 repair_material
rename var6 mileage

replace vin=vin[_n-1] if vin=="" & vin[_n-1]~=""

tab repair_type, m

replace repair_type="" if repair_type=="-"


* repair_type_new=1, maintenance
gen temp_pos1=strpos(repair_type, "保养")
gen temp_pos2=strpos(repair_type, "BY")
gen temp_pos3=strpos(repair_type, "SHOUBAO")
gen temp_pos4=strpos(repair_type, "首保")
gen temp_pos5=strpos(repair_type, "公里")
gen temp_pos6=strpos(repair_type, "定保")
gen temp_pos7=strpos(repair_type, "常规保")
gen temp_pos8=strpos(repair_type, "强保")
gen temp_pos9=strpos(repair_type, "免保")
gen temp_pos10=strpos(repair_type, "一保")
gen temp_pos11=strpos(repair_type, "二保")
gen temp_pos12=strpos(repair_type, "三保")
gen temp_pos13=strpos(repair_type, "四保")



egen temp_total=rowtotal(temp_pos1-temp_pos13)
gen repair_type_new=1 if temp_total>0
drop temp_pos1-temp_pos13 temp_total

* repair_type_new=2, clean 
gen temp_pos=strpos(repair_type, "清洗")
replace repair_type_new=2 if temp_pos>0
drop temp_pos

* repair_type_new=3, check
gen temp_pos1=strpos(repair_type, "检查")
gen temp_pos2=strpos(repair_type, "定检")
gen temp_pos3=strpos(repair_type, "检测")
gen temp_pos4=strpos(repair_type, "年审")
gen temp_pos5=strpos(repair_type, "PDI")
gen temp_pos6=strpos(repair_type, "INSPECTION") 
gen temp_pos7=strpos(repair_type, "体检") 
gen temp_pos8=strpos(repair_type, "免检") 
gen temp_pos9=strpos(repair_type, "冬检") 
gen temp_pos10=strpos(repair_type, "专检") 
gen temp_pos11=strpos(repair_type, "普检") 


egen temp_total=rowtotal(temp_pos1-temp_pos11)
replace repair_type_new=3 if temp_total>0
drop temp_pos1-temp_pos11 temp_total

* repair_type_new=4, repair
gen temp_pos1=strpos(repair_type, "更换")
gen temp_pos2=strpos(repair_type, "修复")
gen temp_pos3=strpos(repair_type, "拆卸")
gen temp_pos4=strpos(repair_type, "安装")
gen temp_pos5=strpos(repair_type, "修理")
gen temp_pos6=strpos(repair_type, "维修")
gen temp_pos7=strpos(repair_type, "报修")
gen temp_pos8=strpos(repair_type, "返修")
gen temp_pos9=strpos(repair_type, "内修")
gen temp_pos10=strpos(repair_type, "拆装")
gen temp_pos11=strpos(repair_type, "换")
gen temp_pos12=strpos(repair_type, "改装")
gen temp_pos13=strpos(repair_type, "小修")
gen temp_pos14=strpos(repair_type, "大修")
gen temp_pos15=strpos(repair_type, "保修")
gen temp_pos16=strpos(repair_type, "维护")
gen temp_pos17=strpos(repair_type, "拆解")
gen temp_pos18=strpos(repair_type, "矫正")
gen temp_pos19=strpos(repair_type, "受损")
gen temp_pos20=strpos(repair_type, "故障")
gen temp_pos21=strpos(repair_type, "召回")
gen temp_pos22=strpos(repair_type, "Recall and Technical Action")
gen temp_pos23=strpos(repair_type, "检修")




egen temp_total=rowtotal(temp_pos1-temp_pos23)
replace repair_type_new=4 if temp_total>0
drop temp_pos1-temp_pos23 temp_total


* repair_type_new=5, beauty
gen temp_pos1=strpos(repair_type, "喷漆")
gen temp_pos2=strpos(repair_type, "烤漆")
gen temp_pos3=strpos(repair_type, "全喷")
gen temp_pos4=strpos(repair_type, "做漆")
gen temp_pos5=strpos(repair_type, "划痕")
gen temp_pos6=strpos(repair_type, "整形")
gen temp_pos7=strpos(repair_type, "补漆")
gen temp_pos8=strpos(repair_type, "美容")
gen temp_pos9=strpos(repair_type, "油漆")
gen temp_pos10=strpos(repair_type, "打蜡")
gen temp_pos11=strpos(repair_type, "钣喷")
gen temp_pos12=strpos(repair_type, "抛光")
gen temp_pos13=strpos(repair_type, "装潢")


egen temp_total=rowtotal(temp_pos1-temp_pos13)
replace repair_type_new=5 if temp_total>0
drop temp_pos1-temp_pos13 temp_total 


* repair_type_new=6, accident
gen temp_pos1=strpos(repair_type, "事故")
gen temp_pos2=strpos(repair_type, "accident car")
gen temp_pos3=strpos(repair_type, "保险")
gen temp_pos4=strpos(repair_type, "理赔")
gen temp_pos5=strpos(repair_type, "人保")
gen temp_pos6=strpos(repair_type, "索赔")
gen temp_pos7=strpos(repair_type, "直赔")
gen temp_pos8=strpos(repair_type, "中保")
gen temp_pos9=strpos(repair_type, "出险")
gen temp_pos10=strpos(repair_type, "平安")
gen temp_pos11=strpos(repair_type, "碰撞")
gen temp_pos12=strpos(repair_type, "三者")
gen temp_pos13=strpos(repair_type, "太保")
gen temp_pos14=strpos(repair_type, "太平洋")



egen temp_total=rowtotal(temp_pos1-temp_pos14)
replace repair_type_new=6 if temp_total>0
drop temp_pos1-temp_pos14 temp_total




tab repair_type if repair_type_new==., m

* repair_type_new=7, others
replace repair_type_new=7 if repair_type_new==. & repair_type~=""

tab repair_type_new, m


**************************************
* Get missing type from repair content
**************************************

count if repair_content~="" & repair_type_new==.
tab repair_content if repair_type_new==.




* repair_type_new=1, maintenance
gen temp_pos1=strpos(repair_content, "保养") if repair_type_new==.
gen temp_pos2=strpos(repair_content, "BY")   if repair_type_new==.
gen temp_pos3=strpos(repair_content, "SHOUBAO") if repair_type_new==.
gen temp_pos4=strpos(repair_content, "首保") if repair_type_new==.
gen temp_pos5=strpos(repair_content, "公里") if repair_type_new==.
gen temp_pos6=strpos(repair_content, "定保") if repair_type_new==.
gen temp_pos7=strpos(repair_content, "常规保") if repair_type_new==.
gen temp_pos8=strpos(repair_content, "强保") if repair_type_new==.
gen temp_pos9=strpos(repair_content, "免保") if repair_type_new==.
gen temp_pos10=strpos(repair_content, "一保") if repair_type_new==.
gen temp_pos11=strpos(repair_content, "二保") if repair_type_new==.
gen temp_pos12=strpos(repair_content, "三保") if repair_type_new==.
gen temp_pos13=strpos(repair_content, "四保") if repair_type_new==.



egen temp_total=rowtotal(temp_pos1-temp_pos13)
replace repair_type_new=1 if temp_total>0 & temp_total~=.
drop temp_pos1-temp_pos13 temp_total

tab repair_type_new, m

* repair_type_new=2, clean 
gen temp_pos=strpos(repair_content, "清洗") if repair_type_new==.
replace repair_type_new=2 if temp_pos>0 & temp_pos~=.
drop temp_pos

* repair_type_new=3, check
gen temp_pos1=strpos(repair_content, "检查") if repair_type_new==.
gen temp_pos2=strpos(repair_content, "定检") if repair_type_new==.
gen temp_pos3=strpos(repair_content, "检测") if repair_type_new==.
gen temp_pos4=strpos(repair_content, "年审") if repair_type_new==.
gen temp_pos5=strpos(repair_content, "PDI") if repair_type_new==.
gen temp_pos6=strpos(repair_content, "INSPECTION") if repair_type_new==.
gen temp_pos7=strpos(repair_content, "体检") if repair_type_new==.
gen temp_pos8=strpos(repair_content, "免检") if repair_type_new==.
gen temp_pos9=strpos(repair_content, "冬检") if repair_type_new==.
gen temp_pos10=strpos(repair_content, "专检") if repair_type_new==.
gen temp_pos11=strpos(repair_content, "普检") if repair_type_new==.


egen temp_total=rowtotal(temp_pos1-temp_pos11)
replace repair_type_new=3 if temp_total>0 & temp_total~=.
drop temp_pos1-temp_pos11 temp_total

tab repair_type_new, m

* repair_type_new=4, repair
gen temp_pos1=strpos(repair_content, "更换") if repair_type_new==.
gen temp_pos2=strpos(repair_content, "修复") if repair_type_new==.
gen temp_pos3=strpos(repair_content, "拆卸") if repair_type_new==.
gen temp_pos4=strpos(repair_content, "安装") if repair_type_new==.
gen temp_pos5=strpos(repair_content, "修理") if repair_type_new==.
gen temp_pos6=strpos(repair_content, "维修") if repair_type_new==.
gen temp_pos7=strpos(repair_content, "报修") if repair_type_new==.
gen temp_pos8=strpos(repair_content, "返修") if repair_type_new==.
gen temp_pos9=strpos(repair_content, "内修") if repair_type_new==.
gen temp_pos10=strpos(repair_content, "拆装") if repair_type_new==.
gen temp_pos11=strpos(repair_content, "换") if repair_type_new==.
gen temp_pos12=strpos(repair_content, "改装") if repair_type_new==.
gen temp_pos13=strpos(repair_content, "小修") if repair_type_new==.
gen temp_pos14=strpos(repair_content, "大修") if repair_type_new==.
gen temp_pos15=strpos(repair_content, "保修") if repair_type_new==.
gen temp_pos16=strpos(repair_content, "维护") if repair_type_new==.
gen temp_pos17=strpos(repair_content, "拆解") if repair_type_new==.
gen temp_pos18=strpos(repair_content, "矫正") if repair_type_new==.
gen temp_pos19=strpos(repair_content, "受损") if repair_type_new==.
gen temp_pos20=strpos(repair_content, "故障") if repair_type_new==.
gen temp_pos21=strpos(repair_content, "召回") if repair_type_new==.
gen temp_pos22=strpos(repair_content, "Recall and Technical Action") if repair_type_new==.
gen temp_pos23=strpos(repair_content, "检修") if repair_type_new==.
gen temp_pos24=strpos(repair_content, "Replace") if repair_type_new==.




egen temp_total=rowtotal(temp_pos1-temp_pos24)
replace repair_type_new=4 if temp_total>0 & temp_total~=.
drop temp_pos1-temp_pos24 temp_total


* repair_type_new=5, beauty
gen temp_pos1=strpos(repair_content, "喷漆") if repair_type_new==.
gen temp_pos2=strpos(repair_content, "烤漆") if repair_type_new==.
gen temp_pos3=strpos(repair_content, "全喷") if repair_type_new==.
gen temp_pos4=strpos(repair_content, "做漆") if repair_type_new==.
gen temp_pos5=strpos(repair_content, "划痕") if repair_type_new==.
gen temp_pos6=strpos(repair_content, "整形") if repair_type_new==.
gen temp_pos7=strpos(repair_content, "补漆") if repair_type_new==.
gen temp_pos8=strpos(repair_content, "美容") if repair_type_new==.
gen temp_pos9=strpos(repair_content, "油漆") if repair_type_new==.
gen temp_pos10=strpos(repair_content, "打蜡") if repair_type_new==.
gen temp_pos11=strpos(repair_content, "钣喷") if repair_type_new==.
gen temp_pos12=strpos(repair_content, "抛光") if repair_type_new==.
gen temp_pos13=strpos(repair_content, "装潢") if repair_type_new==.


egen temp_total=rowtotal(temp_pos1-temp_pos13)
replace repair_type_new=5 if temp_total>0 & temp_total~=.
drop temp_pos1-temp_pos13 temp_total 


* repair_type_new=6, accident
gen temp_pos1=strpos(repair_content, "事故") if repair_type_new==.
gen temp_pos2=strpos(repair_content, "accident car") if repair_type_new==.
gen temp_pos3=strpos(repair_content, "保险") if repair_type_new==.
gen temp_pos4=strpos(repair_content, "理赔") if repair_type_new==.
gen temp_pos5=strpos(repair_content, "人保") if repair_type_new==.
gen temp_pos6=strpos(repair_content, "索赔") if repair_type_new==.
gen temp_pos7=strpos(repair_content, "直赔") if repair_type_new==.
gen temp_pos8=strpos(repair_content, "中保") if repair_type_new==.
gen temp_pos9=strpos(repair_content, "出险") if repair_type_new==.
gen temp_pos10=strpos(repair_content, "平安") if repair_type_new==.
gen temp_pos11=strpos(repair_content, "碰撞") if repair_type_new==.
gen temp_pos12=strpos(repair_content, "三者") if repair_type_new==.
gen temp_pos13=strpos(repair_content, "太保") if repair_type_new==.
gen temp_pos14=strpos(repair_content, "太平洋") if repair_type_new==.


egen temp_total=rowtotal(temp_pos1-temp_pos14)
replace repair_type_new=6 if temp_total>0 & temp_total~=.
drop temp_pos1-temp_pos14 temp_total




tab repair_content if repair_type_new==., m

* repair_type_new=7, others
replace repair_type_new=7 if repair_type_new==. & repair_content~=""

tab repair_type_new, m





* clean mileage
destring mileage, force gen(mileage_new)

list repair_date mileage if mileage_new==.

gen temp_pos=strpos(mileage, "Km")
gen temp=substr(mileage, 1, temp_pos-1) if temp_pos>0
destring temp, force replace
replace mileage_new=temp if temp~=.

drop temp_pos temp 
drop mileage

rename mileage_new mileage

gen calendar_day=repair_date

count if repair_date==""
drop if repair_date==""

* each day may have multiple repairs
bysort vin calendar_day: gen temp=_N
tab temp, m
drop temp



save working_data/repair_clean_070319.dta, replace



**********************************************************
* Further Clean: Aggregate to day level, no repair deatils
**********************************************************


use working_data/repair_clean_070319.dta, clear

* clean formats for calendar_dates
gen temp_length=length(calendar_day)
tab temp_length, m

tab calendar_day if temp_length==6
gen temp=substr(calendar_day, 1, 1) if temp_length==6  // month
gen temp1=substr(calendar_day, 3, 1) if temp_length==6 // day
gen temp2=substr(calendar_day, 5, 2) if temp_length==6 // year

replace calendar_day="20"+temp2+"-0"+temp+"-0"+temp1 if temp_length==6
replace repair_date=calendar_day if temp_length==6
drop temp temp1 temp2

tab calendar_day if temp_length==7
gen temp=substr(calendar_day, 1, 1) if temp_length==7  // month
gen temp1=substr(calendar_day, 3, 2) if temp_length==7 // day
gen temp2=substr(calendar_day, 6, 2) if temp_length==7 // year

replace calendar_day="20"+temp2+"-0"+temp+"-"+temp1 if temp_length==7
replace repair_date=calendar_day if temp_length==7
drop temp temp1 temp2

tab calendar_day if temp_length==8
gen temp=substr(calendar_day, 1, 2) if temp_length==8  // month
gen temp1=substr(calendar_day, 4, 2) if temp_length==8 // day
gen temp2=substr(calendar_day, 7, 2) if temp_length==8 // year

replace calendar_day="20"+temp2+"-"+temp+"-"+temp1 if temp_length==8
replace repair_date=calendar_day if temp_length==8
drop temp temp1 temp2

drop temp_length


tab repair_type_new, m
drop if repair_type_new==.

bysort vin repair_date: egen temp_max=max(mileage)
replace mileage=temp_max if temp_max~=.
drop temp_max

sort vin repair_date repair_type_new

bysort vin repair_date repair_type_new: gen count=_N
bysort vin repair_date repair_type_new: keep if _n==1

drop repair_type repair_content repair_material

reshape wide count, i(vin repair_date) j(repair_type_new)

forvalues i=1(1)7{
	replace count`i'=0 if count`i'==.
}

rename count1 count_maintenance
rename count2 count_clean
rename count3 count_check
rename count4 count_repair
rename count5 count_beauty
rename count6 count_accident
rename count7 count_others


save working_data/repair_clean_further_070319.dta, replace





